This example uses the tidyverse suite of packages.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

Read data

The code chunk below reads in the final project data.

# Read the training data; columns are typed automatically (2 chr, 6 dbl below).
df <- readr::read_csv("paint_project_train_data.csv", col_names = TRUE)
## Rows: 835 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Lightness, Saturation
## dbl (6): R, G, B, Hue, response, outcome
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep the predictor columns plus the binary outcome, and recode the 0/1
# outcome into a two-level factor with 'event' listed first.
df_scaled <- df %>%
  select(R, G, B, Hue, Saturation, Lightness, outcome) %>%
  mutate(
    outcome = factor(ifelse(outcome == 1, 'event', 'non_event'),
                     levels = c('event', 'non_event'))
  )
# Fix the RNG state so the resampling splits below are reproducible.
set.seed(1234)

Comparing models on accuracy

# 5-fold cross-validation repeated 3 times; keep all resample predictions.
my_ctrl_acc <- trainControl(method = 'repeatedcv', number = 5, repeats = 3, savePredictions = 'all')
my_metrics_acc <- "Accuracy"
# Candidate model formulas as strings (converted via as.formula() at fit time).
formulas <- list('outcome ~ .',
                 'outcome ~ Lightness + Saturation + (R + G + B + Hue)^2',
                 # NOTE(review): the second (R + G + B + Hue) factor is a duplicate;
                 # R's formula algebra de-duplicates terms, so this reduces to
                 # (Lightness + Saturation)*(R + G + B + Hue). Confirm whether a
                 # squared expansion, e.g. (R + G + B + Hue)^2, was intended.
                 'outcome ~ (Lightness + Saturation)*(R + G + B + Hue)*(R + G + B + Hue)',
                 'outcome ~ (Lightness + Saturation + Hue)*(R + G + B)')
# Fit a logistic-regression (method = 'glm') model with caret.
#
# Args:
#   form: model formula as a character string; converted with as.formula().
#   met:  metric caret uses to pick the final model (e.g. "Accuracy" or "ROC").
#   ctrl: caret::trainControl object defining the resampling scheme.
#   data: training data frame; defaults to the globally prepared df_scaled,
#         so existing callers are unaffected.
#
# Returns: the fitted caret::train object.
formula_to_glm_model <- function(form, met, ctrl, data = df_scaled) {
  caret::train(as.formula(form),
        data = data,
        method = 'glm',
        preProcess = c('center', 'scale'),
        metric = met,
        trControl = ctrl)
}
# Fit all four GLM candidates; keep each formula next to its fitted model.
glm_default_acc <- tibble::tibble(
  formula = formulas,
  models = purrr::map(formulas, formula_to_glm_model, my_metrics_acc, my_ctrl_acc)
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Fit an elastic-net (method = 'glmnet') model with caret's default tuning grid.
#
# Args:
#   form: model formula as a character string; converted with as.formula().
#   met:  metric caret uses to pick the final model.
#   ctrl: caret::trainControl object defining the resampling scheme.
#   data: training data frame; defaults to the globally prepared df_scaled,
#         so existing callers are unaffected.
#
# Returns: the fitted caret::train object.
formula_to_enet_model <- function(form, met, ctrl, data = df_scaled) {
  caret::train(as.formula(form),
        data = data,
        method = 'glmnet',
        preProcess = c('center', 'scale'),
        metric = met,
        trControl = ctrl)
}
# Default elastic-net fits for the two interaction formulas.
# Map over formulas[2:3] explicitly rather than the in-construction `formula`
# column: the original relied on tibble's sequential column masking with a
# name that shadows stats::formula, which is fragile and inconsistent with
# the glm fit above.
enet_default_acc <- tibble::tibble(
  formula = formulas[2:3],
  models = purrr::map(formulas[2:3], formula_to_enet_model, my_metrics_acc, my_ctrl_acc)
)
## Warning: from glmnet C++ code (error code -100); Convergence for 100th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
# Tune an elastic net over a custom alpha/lambda grid derived from a default fit.
#
# Args:
#   form:         model formula as a character string.
#   enet_default: a caret::train glmnet fit whose explored lambda range seeds
#                 the refined grid.
#   met:          metric caret uses to pick the final model.
#   ctrl:         caret::trainControl object defining the resampling scheme.
#   data:         training data frame; defaults to the global df_scaled,
#                 so existing callers are unaffected.
#
# Returns: the tuned caret::train object.
formula_to_enet_tune_model <- function(form, enet_default, met, ctrl, data = df_scaled) {
  # 25 log-spaced lambda values spanning the range the default fit explored.
  my_lambda_grid <- exp(seq(log(min(enet_default$results$lambda)),
                          log(max(enet_default$results$lambda)),
                          length.out = 25))
  # Full alpha (mixing) by lambda (penalty) grid.
  enet_grid <- expand.grid(alpha = seq(0.1, 1, by = 0.1),
                           lambda = my_lambda_grid)
  caret::train(as.formula(form),
        data = data,
        method = 'glmnet',
        preProcess = c('center', 'scale'),
        tuneGrid = enet_grid,
        metric = met,
        trControl = ctrl)
}
# Tuned elastic-net fits, seeded by the corresponding default fits.
# Map over formulas[2:3] explicitly instead of the in-construction `formula`
# column (which shadowed stats::formula via tibble's sequential masking).
enet_tune_acc <- tibble::tibble(
  formula = formulas[2:3],
  models = purrr::map2(formulas[2:3], enet_default_acc$models, formula_to_enet_tune_model, my_metrics_acc, my_ctrl_acc, .progress = TRUE)
)
## ■■■■■■■■■■■■■■■■ 50% | ETA: 23s
## Warning: from glmnet C++ code (error code -100); Convergence for 100th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
## 
# Cross-validated accuracy versus (log-scaled) lambda for each tuned model.
plot(enet_tune_acc$models[[1]], xTrans = log)

plot(enet_tune_acc$models[[2]], xTrans = log)

# Variable importance for the tuned elastic nets (top 20 terms for the
# larger interaction model).
plot( varImp(enet_tune_acc$models[[1]]) )

plot( varImp(enet_tune_acc$models[[2]]), top = 20 )

# Neural network (nnet) with caret's default tuning grid.
nnet_default_acc <- train( outcome ~ .,
                       data = df_scaled,
                       method = 'nnet',
                       metric = my_metrics_acc,
                       preProcess = c("center", "scale"),
                       trControl = my_ctrl_acc,
                       trace = FALSE)
# Custom grid: hidden-layer sizes and 11 log-spaced weight-decay values.
nnet_grid <- expand.grid(size = c(5, 10, 20),
                         decay = exp(seq(-6, 2, length.out=11)))
nnet_tune_acc <- train( outcome ~ .,
                       data = df_scaled,
                       method = 'nnet',
                       metric = my_metrics_acc,
                       preProcess = c("center", "scale"),
                       tuneGrid = nnet_grid,
                       trControl = my_ctrl_acc,
                       trace = FALSE)
# Accuracy versus (log-scaled) decay, one curve per hidden-layer size.
plot(nnet_tune_acc, xTrans=log)

# Random forest with caret's default mtry tuning; importance = TRUE so that
# varImp() can report variable importance afterwards.
rf_default_acc <- train( outcome ~ .,
                     data = df_scaled,
                     method = 'rf',
                     metric = my_metrics_acc,
                     trControl = my_ctrl_acc,
                     importance = TRUE)
plot( varImp(rf_default_acc) )

# Gradient-boosted trees (xgboost) with caret's default tuning grid.
# nthread = 1 keeps xgboost single-threaded so caret controls parallelism.
xgb_default_acc <- caret::train(outcome ~ ., data = df_scaled,
                            metric = my_metrics_acc,
                            trControl = my_ctrl_acc,
                            method = 'xgbTree',
                            verbosity = 0,
                            nthread = 1)
plot(xgb_default_acc)

# Refined grid: vary nrounds, max_depth, and eta around the default best
# tune; hold the remaining hyperparameters at their best default values.
xgb_grid <- expand.grid(nrounds = seq(25, 50, by = 25),
                        max_depth = c(3,6,9,12),
                        eta = c(0.25, 0.5, 1) * xgb_default_acc$bestTune$eta,
                        gamma = xgb_default_acc$bestTune$gamma,
                        colsample_bytree = xgb_default_acc$bestTune$colsample_bytree,
                        min_child_weight = xgb_default_acc$bestTune$min_child_weight,
                        subsample = xgb_default_acc$bestTune$subsample)
xgb_tune_acc <- caret::train(outcome ~ ., data = df_scaled,
                            metric = my_metrics_acc,
                            trControl = my_ctrl_acc,
                            method = 'xgbTree',
                            verbosity = 0,
                            tuneGrid = xgb_grid,
                            nthread = 1)
plot(xgb_tune_acc)

plot( varImp(xgb_default_acc) )

plot( varImp(xgb_tune_acc) )

# Compare all accuracy-trained models on their shared resampling folds
# (valid because every model used the same my_ctrl_acc and seed).
caret_acc_compare <- resamples(
  list(
    glm_default_acc_1 = glm_default_acc$models[[1]],
    glm_default_acc_2 = glm_default_acc$models[[2]],
    glm_default_acc_3 = glm_default_acc$models[[3]],
    glm_default_acc_4 = glm_default_acc$models[[4]],
    enet_default_acc_1 = enet_default_acc$models[[1]],
    enet_default_acc_2 = enet_default_acc$models[[2]],
    enet_tune_acc_1 = enet_tune_acc$models[[1]],
    enet_tune_acc_2 = enet_tune_acc$models[[2]],
    nnet_default_acc = nnet_default_acc,
    nnet_tune_acc = nnet_tune_acc,
    rf_default_acc = rf_default_acc,
    xgb_default_acc = xgb_default_acc,
    xgb_tune_acc = xgb_tune_acc
  )
)
# Dot plot of resampled accuracy with confidence intervals, per model.
caret_acc_compare %>% dotplot(metric = my_metrics_acc)

# Collect every accuracy-trained model under a descriptive name so each can
# be persisted to disk below.
acc_models_list <- list(
    glm_default_acc_1 = glm_default_acc$models[[1]],
    glm_default_acc_2 = glm_default_acc$models[[2]],
    glm_default_acc_3 = glm_default_acc$models[[3]],
    glm_default_acc_4 = glm_default_acc$models[[4]],
    enet_default_acc_1 = enet_default_acc$models[[1]],
    enet_default_acc_2 = enet_default_acc$models[[2]],
    enet_tune_acc_1 = enet_tune_acc$models[[1]],
    enet_tune_acc_2 = enet_tune_acc$models[[2]],
    nnet_default_acc = nnet_default_acc,
    nnet_tune_acc = nnet_tune_acc,
    rf_default_acc = rf_default_acc,
    xgb_default_acc = xgb_default_acc,
    xgb_tune_acc = xgb_tune_acc
  )

# Save each fitted model as <name>.rds (paste0 replaces paste(sep = '')).
for (name in names(acc_models_list)) {
  readr::write_rds(acc_models_list[[name]], paste0(name, '.rds'))
}

Comparing models based on ROC/AUC

# Same 5-fold x 3 repeat scheme, but with class probabilities and the
# twoClassSummary function so caret can compute ROC/Sens/Spec per resample.
my_ctrl_roc <- trainControl(method = 'repeatedcv', number = 5, repeats = 3,
                            summaryFunction = twoClassSummary,
                            classProbs = TRUE,
                            savePredictions = TRUE)

my_metrics_roc <- 'ROC'
# Refit the four GLM candidates, now selecting on area under the ROC curve.
glm_default_roc <- tibble::tibble(
  formula = formulas,
  models = purrr::map(formulas, formula_to_glm_model, my_metrics_roc, my_ctrl_roc)
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Default elastic-net fits (ROC metric) for the two interaction formulas.
# Map over formulas[2:3] explicitly rather than the in-construction `formula`
# column, which shadowed stats::formula via tibble's sequential masking.
enet_default_roc <- tibble::tibble(
  formula = formulas[2:3],
  models = purrr::map(formulas[2:3], formula_to_enet_model, my_metrics_roc, my_ctrl_roc)
)
# Tuned elastic-net fits (ROC metric), seeded by the default ROC fits.
# Map over formulas[2:3] explicitly instead of the in-construction `formula`
# column (which shadowed stats::formula via tibble's sequential masking).
enet_tune_roc <- tibble::tibble(
  formula = formulas[2:3],
  models = purrr::map2(formulas[2:3], enet_default_roc$models, formula_to_enet_tune_model, my_metrics_roc, my_ctrl_roc)
)
## Warning: from glmnet C++ code (error code -100); Convergence for 100th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
# Cross-validated ROC versus (log-scaled) lambda for each tuned model.
plot(enet_tune_roc$models[[1]], xTrans = log)

plot(enet_tune_roc$models[[2]], xTrans = log)

# Variable importance for the tuned elastic nets (top 20 terms for the
# larger interaction model).
plot(varImp(enet_tune_roc$models[[1]]))

plot(varImp(enet_tune_roc$models[[2]]), top = 20)

# Neural network (nnet) with caret's default grid, selected on ROC.
nnet_default_roc <- train( outcome ~ .,
                       data = df_scaled,
                       method = 'nnet',
                       metric = my_metrics_roc,
                       preProcess = c("center", "scale"),
                       trControl = my_ctrl_roc,
                       trace = FALSE)
# Same custom grid as the accuracy section: sizes x log-spaced decay values.
nnet_grid <- expand.grid(size = c(5, 10, 20),
                         decay = exp(seq(-6, 2, length.out=11)))
nnet_tune_roc <- train( outcome ~ .,
                       data = df_scaled,
                       method = 'nnet',
                       metric = my_metrics_roc,
                       preProcess = c("center", "scale"),
                       tuneGrid = nnet_grid,
                       trControl = my_ctrl_roc,
                       trace = FALSE)
# ROC versus (log-scaled) decay, one curve per hidden-layer size.
plot(nnet_tune_roc, xTrans=log)

# Random forest selected on ROC; importance = TRUE enables varImp() below.
rf_default_roc <- train( outcome ~ .,
                     data = df_scaled,
                     method = 'rf',
                     metric = my_metrics_roc,
                     trControl = my_ctrl_roc,
                     importance = TRUE)
plot( varImp(rf_default_roc) )

# Gradient-boosted trees (xgboost) selected on ROC; nthread = 1 keeps
# xgboost single-threaded so caret controls parallelism.
xgb_default_roc <- caret::train(outcome ~ ., data = df_scaled,
                            metric = my_metrics_roc,
                            trControl = my_ctrl_roc,
                            method = 'xgbTree',
                            verbosity = 0,
                            nthread = 1)
plot(xgb_default_roc)

# Refined grid around the default best tune, mirroring the accuracy section.
xgb_grid <- expand.grid(nrounds = seq(25, 50, by = 25),
                        max_depth = c(3,6,9,12),
                        eta = c(0.25, 0.5, 1) * xgb_default_roc$bestTune$eta,
                        gamma = xgb_default_roc$bestTune$gamma,
                        colsample_bytree = xgb_default_roc$bestTune$colsample_bytree,
                        min_child_weight = xgb_default_roc$bestTune$min_child_weight,
                        subsample = xgb_default_roc$bestTune$subsample)
xgb_tune_roc <- caret::train(outcome ~ ., data = df_scaled,
                            metric = my_metrics_roc,
                            trControl = my_ctrl_roc,
                            method = 'xgbTree',
                            verbosity = 0,
                            tuneGrid = xgb_grid,
                            nthread = 1)
plot(xgb_tune_roc)

plot( varImp(xgb_default_roc) )

plot( varImp(xgb_tune_roc) )

# Compare all ROC-trained models on their shared resampling folds
# (valid because every model used the same my_ctrl_roc and seed).
caret_roc_compare <- resamples(
  list(
    glm_default_roc_1 = glm_default_roc$models[[1]],
    glm_default_roc_2 = glm_default_roc$models[[2]],
    glm_default_roc_3 = glm_default_roc$models[[3]],
    glm_default_roc_4 = glm_default_roc$models[[4]],
    enet_default_roc_1 = enet_default_roc$models[[1]],
    enet_default_roc_2 = enet_default_roc$models[[2]],
    enet_tune_roc_1 = enet_tune_roc$models[[1]],
    enet_tune_roc_2 = enet_tune_roc$models[[2]],
    nnet_default_roc = nnet_default_roc,
    nnet_tune_roc = nnet_tune_roc,
    rf_default_roc = rf_default_roc,
    xgb_default_roc = xgb_default_roc,
    xgb_tune_roc = xgb_tune_roc
  )
)
# Dot plot of resampled ROC with confidence intervals, per model.
caret_roc_compare %>% dotplot(metric = my_metrics_roc)

# Collect every ROC-trained model under a descriptive name so each can be
# persisted to disk below.
roc_models_list <- list(
    glm_default_roc_1 = glm_default_roc$models[[1]],
    glm_default_roc_2 = glm_default_roc$models[[2]],
    glm_default_roc_3 = glm_default_roc$models[[3]],
    glm_default_roc_4 = glm_default_roc$models[[4]],
    enet_default_roc_1 = enet_default_roc$models[[1]],
    enet_default_roc_2 = enet_default_roc$models[[2]],
    enet_tune_roc_1 = enet_tune_roc$models[[1]],
    enet_tune_roc_2 = enet_tune_roc$models[[2]],
    nnet_default_roc = nnet_default_roc,
    nnet_tune_roc = nnet_tune_roc,
    rf_default_roc = rf_default_roc,
    xgb_default_roc = xgb_default_roc,
    xgb_tune_roc = xgb_tune_roc
  )

# Save each fitted model as <name>.rds (paste0 replaces paste(sep = '')).
for (name in names(roc_models_list)) {
  readr::write_rds(roc_models_list[[name]], paste0(name, '.rds'))
}